# NOTE: gather()/spread() are superseded in current tidyr by
# pivot_longer()/pivot_wider(); they are kept here because the surrounding
# text teaches them by name.

# Stack columns 2 through 7 of `pew` into income_bracket/count pairs.
gather(pew, key = "income_bracket", value = "count", 2:7)

# Stack columns 6 through 81 of `billboard` into week/rank pairs.
gather(billboard, key = "week", value = "rank", 6:81)

# Stack the d1..d8 columns of `weather` into day/temp pairs, then give each
# distinct `element` value its own column filled from `temp`.
weather %>%
  gather(key = "day", value = "temp", d1:d8) %>%
  spread(key = "element", value = "temp")
separate()ed. unite()ed.
A
## # A tibble: 6 x 3 ## country year rate ## * <chr> <int> <chr> ## 1 Afghanistan 1999 745/19987071 ## 2 Afghanistan 2000 2666/20595360 ## 3 Brazil 1999 37737/172006362 ## 4 Brazil 2000 80488/174504898 ## 5 China 1999 212258/1272915272 ## 6 China 2000 213766/1280428583
# Split `rate` into `cases` and `population`. No `sep` is supplied, so
# separate() falls back to its default of splitting at non-alphanumeric
# characters (here the "/").
separate(A, rate, into = c("cases", "population"))
## # A tibble: 6 x 4 ## country year cases population ## * <chr> <int> <chr> <chr> ## 1 Afghanistan 1999 745 19987071 ## 2 Afghanistan 2000 2666 20595360 ## 3 Brazil 1999 37737 172006362 ## 4 Brazil 2000 80488 174504898 ## 5 China 1999 212258 1272915272 ## 6 China 2000 213766 1280428583
# Split `rate` into `cases` and `population`, naming the "/" separator
# explicitly.
separate(A, rate, into = c("cases", "population"), sep = "/")
## # A tibble: 6 x 4 ## country year cases population ## * <chr> <int> <chr> <chr> ## 1 Afghanistan 1999 745 19987071 ## 2 Afghanistan 2000 2666 20595360 ## 3 Brazil 1999 37737 172006362 ## 4 Brazil 2000 80488 174504898 ## 5 China 1999 212258 1272915272 ## 6 China 2000 213766 1280428583
# Split `rate` into `cases` and `population`; convert = TRUE asks separate()
# to re-type the new columns instead of leaving them as character.
separate(A, rate, into = c("cases", "population"), convert = TRUE)
## # A tibble: 6 x 4 ## country year cases population ## * <chr> <int> <int> <int> ## 1 Afghanistan 1999 745 19987071 ## 2 Afghanistan 2000 2666 20595360 ## 3 Brazil 1999 37737 172006362 ## 4 Brazil 2000 80488 174504898 ## 5 China 1999 212258 1272915272 ## 6 China 2000 213766 1280428583
unite() is a less common operation than separate() (though it is needed for Exercise 5.7). Imagine you were working with this (butchered) form of data set C.
# Build the "butchered" data set: split `year` after its second character
# into `century` and `year_in_century`, then print the result.
C2 <- separate(C, year, into = c("century", "year_in_century"), sep = 2)
C2
## # A tibble: 6 x 5 ## country century year_in_century cases population ## <chr> <chr> <chr> <int> <int> ## 1 Afghanistan 19 99 745 19987071 ## 2 Afghanistan 20 00 2666 20595360 ## 3 Brazil 19 99 37737 172006362 ## 4 Brazil 20 00 80488 174504898 ## 5 China 19 99 212258 1272915272 ## 6 China 20 00 213766 1280428583
We need to unite() the century and year_in_century columns.
# Paste `century` and `year_in_century` back together, with no separator,
# into a single `year` column.
unite(C2, col = "year", century, year_in_century, sep = "")
## # A tibble: 6 x 4 ## country year cases population ## <chr> <chr> <int> <int> ## 1 Afghanistan 1999 745 19987071 ## 2 Afghanistan 2000 2666 20595360 ## 3 Brazil 1999 37737 172006362 ## 4 Brazil 2000 80488 174504898 ## 5 China 1999 212258 1272915272 ## 6 China 2000 213766 1280428583
gather() the columns into values. spread()ing the values across the columns. separate()ed. unite()ed.
Do you ever find yourself with .Rmd files that look like this?
# Schematic only: the same pipeline copy-pasted for three data frames.
# (`...` stands for the elided steps.)
my_df1 %>%
  ... # do some stuff to my_df1
  ...

my_df2 %>%
  ... # do the same stuff to my_df2
  ...

my_df3 %>%
  ... # and again to my_df3
  ...
What if I want to draw the same kind of plot several times?
# The same point-plus-line plot drawn three times; only the data frame and
# the columns mapped to x, y and color differ between the copies.
my_df1 %>%
  ggplot(aes(x = var1, y = var2, color = var3)) +
  geom_point() +
  geom_line()

my_df2 %>%
  ggplot(aes(x = varA, y = varB, color = varC)) +
  geom_point() +
  geom_line()

my_df3 %>%
  ggplot(aes(x = var1A, y = var2B, color = var3C)) +
  geom_point() +
  geom_line()
name_of_function <- function(data, var = "value") {
. . .
. . .
<valid R code>
. . .
. . .
return(x)
}
Arguments: data, var
data is required; var is optional - it has a default value of "value".
Return value: x
library(tidyverse)
# Return the rows of `mpg` whose `model` column equals `mod`.
#
# mod: the model value to match (required; there is no default).
# Returns the filtered `mpg` rows, with all columns kept.
my_cars <- function(mod) {
  filter(mpg, model == mod)
}
my_cars("protege")
## # A tibble: 0 x 11 ## # ... with 11 variables: manufacturer <chr>, model <chr>, displ <dbl>, ## # year <int>, cyl <int>, trans <chr>, drv <chr>, cty <int>, hwy <int>, ## # fl <chr>, class <chr>
# Return the rows of `mpg` whose `model` column equals `mod`.
#
# mod: the model value to match; defaults to "civic" when omitted.
# Returns the filtered `mpg` rows, with all columns kept.
my_cars <- function(mod = "civic") {
  filter(mpg, model == mod)
}
my_cars()
## # A tibble: 9 x 11 ## manufacturer model displ year cyl trans drv cty hwy fl class ## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr> ## 1 honda civic 1.60 1999 4 manu… f 28 33 r subc… ## 2 honda civic 1.60 1999 4 auto… f 24 32 r subc… ## 3 honda civic 1.60 1999 4 manu… f 25 32 r subc… ## 4 honda civic 1.60 1999 4 manu… f 23 29 p subc… ## 5 honda civic 1.60 1999 4 auto… f 24 32 r subc… ## 6 honda civic 1.80 2008 4 manu… f 26 34 r subc… ## 7 honda civic 1.80 2008 4 auto… f 25 36 r subc… ## 8 honda civic 1.80 2008 4 auto… f 24 36 c subc… ## 9 honda civic 2.00 2008 4 manu… f 21 29 p subc…
my_cars("jetta")
## # A tibble: 9 x 11 ## manufacturer model displ year cyl trans drv cty hwy fl class ## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr> ## 1 volkswagen jetta 1.90 1999 4 manu… f 33 44 d comp… ## 2 volkswagen jetta 2.00 1999 4 manu… f 21 29 r comp… ## 3 volkswagen jetta 2.00 1999 4 auto… f 19 26 r comp… ## 4 volkswagen jetta 2.00 2008 4 auto… f 22 29 p comp… ## 5 volkswagen jetta 2.00 2008 4 manu… f 21 29 p comp… ## 6 volkswagen jetta 2.50 2008 5 auto… f 21 29 r comp… ## 7 volkswagen jetta 2.50 2008 5 manu… f 21 29 r comp… ## 8 volkswagen jetta 2.80 1999 6 auto… f 16 23 r comp… ## 9 volkswagen jetta 2.80 1999 6 manu… f 17 24 r comp…
my_cars("camry") %>%
head(2)
## # A tibble: 2 x 11 ## manufacturer model displ year cyl trans drv cty hwy fl class ## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr> ## 1 toyota camry 2.20 1999 4 manu… f 21 29 r mids… ## 2 toyota camry 2.20 1999 4 auto… f 21 27 r mids…
my_cars(mod = "corolla") %>% head(2)
## # A tibble: 2 x 11 ## manufacturer model displ year cyl trans drv cty hwy fl class ## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr> ## 1 toyota coro… 1.80 1999 4 auto… f 24 30 r comp… ## 2 toyota coro… 1.80 1999 4 auto… f 24 33 r comp…
Pay attention to:
What does this do?
# Find the year in which `name_arg` had its largest summed `prop`.
#
# data:     a data frame with `name`, `year` and `prop` columns.
# name_arg: the value to look for in the `name` column.
# Returns a one-column (`year`) data frame holding the top-ranked year.
most_popular_year <- function(data, name_arg) {
  # Keep only the rows for the requested name.
  matching_rows <- filter(data, name == name_arg)

  # Add up `prop` within each year.
  yearly_totals <- summarize(group_by(matching_rows, year), total = sum(prop))

  # Largest total first, then keep just that row's year.
  ranked <- arrange(yearly_totals, desc(total))
  select(head(ranked, 1), year)
}
# Attach the babynames package, then call most_popular_year() with both
# arguments named.
library(babynames)
most_popular_year(data = babynames, name_arg = "Andrew")
## # A tibble: 1 x 1 ## year ## <dbl> ## 1 1987
most_popular_year(babynames, "Andrew")
## # A tibble: 1 x 1 ## year ## <dbl> ## 1 1987
# most_popular_year("Andrew")
For the following exercises, use the pnwflights14 dataset.